import java.net.URL;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
/*
import java.net.Proxy;
import java.net.InetSocketAddress;
import java.net.InetAddress;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
*/
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.IOException;

import java.io.File;
import java.io.FileWriter;
import javax.imageio.ImageIO;
import java.io.PrintStream;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.HashMap;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * Scraper for pages on ixiupet.com.
 *
 * <p>Pages are fetched with {@link #getInfo(String)}, decoded as GBK, parsed with Jsoup and
 * then written out as CSV rows ({@link #wxmlForIxiu}, {@link #CsvForBreed}) or as JPEG files
 * ({@link #saveImages}). The scraping methods report failures with
 * {@code printStackTrace()} instead of propagating them.
 */
public class MainSpider {
	/** Maximum time to wait for a connection to be established, in milliseconds. */
	private static final int CONNECT_TIMEOUT_MS = 3000;
	/** Maximum time a single read may block, in milliseconds, so a stalled server cannot hang the crawl. */
	private static final int READ_TIMEOUT_MS = 10000;
	/** Pause after each successful connect, in milliseconds (presumably to throttle requests; confirm). */
	private static final long REQUEST_DELAY_MS = 1500;
	/** Charset used to decode every downloaded HTML page. */
	private static final String PAGE_CHARSET = "gbk";

	/** Destination for progress messages. */
	PrintStream out = System.out;

	/**
	 * Scrapes one hard-coded article and writes it to a hard-coded CSV file.
	 *
	 * @param args unused
	 */
	public static void main(String args[]) {
		try {
			MainSpider spider = new MainSpider();
			String url = "https://www.ixiupet.com/gougou/ggmr/22721.html";

			// wxmlForIxiu closes the writer itself, on success and on failure.
			FileWriter fw = new FileWriter("E:\\temp\\essay.csv");
			spider.wxmlForIxiu(url, fw);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Downloads one article page, prints the restyled article HTML to {@link #out} and writes
	 * a CSV header plus one data row to {@code fw}.
	 *
	 * <p>The writer is always closed before this method returns. Field values are written as-is,
	 * so a comma inside the title or description is not escaped.
	 *
	 * @param url address of the article page
	 * @param fw  writer receiving the CSV text; closed by this method
	 */
	void wxmlForIxiu(String url, FileWriter fw) {
		List<String> fields = Arrays.asList("pet","type","title","cover","description","time","origin","writer");
		List<String> result = new ArrayList<>();
		try {
			String html = InputStream2String(new InputStreamReader(getInfo(url), PAGE_CHARSET));
			Document doc = Jsoup.parse(html);
			String cover = "";
			String description = "";
			Elements article = doc.getElementsByClass("inner").select("div.article-content");
			for (Element a : article) {
				// Keep the first non-empty value found. isEmpty() compares content, whereas
				// the former == "" compared object identity.
				if (cover.isEmpty()) {
					cover = a.select("img").attr("src");
				}
				if (description.isEmpty()) {
					// Strip any literal tag text that survives text extraction.
					description = a.select("p").text()
							.replaceAll("<a href=(.*) target=\"_blank\">", "")
							.replaceAll("<u>", "")
							.replaceAll("</u>", "")
							.replaceAll("</a>", "");
				}
				// Inline styles are applied before the article HTML is printed.
				a.select("img").attr("style", "width:100%;");
				a.select("p").attr("style","margin:2px 8px 8px 8px;text-align:justify");
				a.select("strong").attr("style","margin-top:4px;margin-bottom:1px");
				out.println(a.html());
			}

			Elements info = doc.getElementsByClass("listltitle");
			// The leading newline puts the data row on its own line after the header.
			result.add("\n"+"dog");
			result.add("晒宠");
			result.add(info.select("h1").text());
			result.add(cover);
			result.add(description.replace(" ",""));
			result.add(info.select("span.spanimg3").text());
			result.add("宠物网(ixiupet.com)");
			// The writer name follows a full-width colon; fall back to an empty field
			// when the separator is missing instead of losing the whole row.
			String[] writerParts = info.select("span.spanimg1").text().split("：");
			result.add(writerParts.length > 1 ? writerParts[1] : "");

			fw.write(String.join(",", fields));
			fw.write(String.join(",", result));
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Close on every path so a failed scrape does not leak the file handle.
			if (fw != null) {
				try {
					fw.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Downloads every image found on the listing pages {@code url + "list_9_" + i + ".html"}
	 * and stores each one as {@code path + <alt text> + ".jpg"}.
	 *
	 * <p>NOTE(review): the loop runs for {@code i = 1 .. pageNum - 1}, so page {@code pageNum}
	 * itself is not fetched; confirm this is intended.
	 *
	 * @param url     base address the listing page name is appended to
	 * @param path    prefix for the output files (a directory path ending in a separator)
	 * @param pageNum exclusive upper bound of the page index
	 */
	public void saveImages(String url, String path, int pageNum) {
		try {
			for (int i = 1; i < pageNum; i++) {
				String pageUrl = url + "list_9_" + i + ".html";
				InputStreamReader response = new InputStreamReader(getInfo(pageUrl), PAGE_CHARSET);
				// println instead of printf: the URL is data, not a format string.
				out.println(pageUrl);
				for (Element petList : parserImg(response)) {
					String src = petList.attr("src");
					// Replace characters that would act as path separators or are illegal in file names.
					String fileName = petList.attr("alt")
							.replace("/", "-")
							.replace("|", "-")
							.replace("\\", "-");
					out.println(src + " " + fileName);
					File file = new File(path + fileName + ".jpg");
					try (InputStream img = getInfo(src)) {
						java.awt.image.BufferedImage image = ImageIO.read(img);
						if (image == null) {
							// ImageIO.read returns null when no reader understands the data;
							// skip this image rather than abort the whole crawl.
							out.println("skipped, not a readable image: " + src);
							continue;
						}
						file.createNewFile();
						if (!ImageIO.write(image, "jpg", file)) {
							out.println("no jpg writer accepted the image: " + src);
						}
					}
				}
				out.println("第"+i+"页保存完毕");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Reads the whole stream into one string and closes the reader.
	 *
	 * <p>Lines are concatenated without any separator. If reading fails, the stack trace is
	 * printed and the text read so far is returned.
	 *
	 * @param in reader to drain; closed by this method
	 * @return the concatenated lines
	 */
	String InputStream2String(InputStreamReader in) {
		StringBuilder sb = new StringBuilder();
		// try-with-resources closes the reader even when readLine() throws.
		try (BufferedReader reader = new BufferedReader(in)) {
			String line;
			while ((line = reader.readLine()) != null) {
				sb.append(line);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Parses a listing page and returns its {@code img} elements that carry a {@code src}
	 * attribute, looked up under class {@code "news-main bg"} and {@code div.tiyan-bd-sml}.
	 *
	 * @param in reader over the page HTML; closed by this method
	 * @return the matching image elements
	 * @throws IOException if closing the reader fails
	 */
	Elements parserImg(InputStreamReader in) throws IOException {
		Document doc = Jsoup.parse(InputStream2String(in));
		Elements petList = doc.getElementsByClass("news-main bg").select("div.tiyan-bd-sml").select("img[src]");
		in.close();
		return petList;
	}

	/**
	 * Extracts the text fields and rating values of one detail page.
	 *
	 * <p>First comes the text of every link under class {@code c1text3}, with commas replaced
	 * by {@code |}. Then, for every {@code div} selected from class {@code pingjialist}, comes
	 * the part of its class attribute after {@code "start"}, or {@code "0"} when that part is
	 * missing or contains a comma.
	 *
	 * @param in reader over the page HTML; closed while it is being read
	 * @return the extracted values in document order
	 */
	ArrayList<String> parserText(InputStreamReader in) {
		ArrayList<String> info = new ArrayList<>();
		Document doc = Jsoup.parse(InputStream2String(in));

		Elements content = doc.getElementsByClass("c1text3");
		for (Element c : content.select("a")) {
			// Commas would break the CSV row, so they are replaced.
			info.add(c.text().replaceAll(",", "|"));
		}

		Elements rate = doc.getElementsByClass("pingjialist");
		for (Element r : rate.select("div")) {
			// A length check replaces the former catch of ArrayIndexOutOfBoundsException.
			String[] parts = r.attr("class").split("start");
			if (parts.length > 1 && !parts[1].contains(",")) {
				info.add(parts[1]);
			} else {
				info.add("0");
			}
		}
		return info;
	}

	/**
	 * Walks the listing pages {@code url + "list_9_" + p + ".html"}, opens the detail page of
	 * every {@code tiyan-smll-det} entry and appends one CSV row per entry to {@code writer}.
	 *
	 * <p>Each row is the output of {@link #parserText} followed by a cloud storage path built
	 * from the entry title. The row terminator is added as a list element, so every row ends
	 * with {@code ",\n"}. The writer is neither flushed nor closed here.
	 *
	 * <p>NOTE(review): the loop runs for {@code p = 1 .. pageNum - 1}, so page {@code pageNum}
	 * itself is not fetched; confirm this is intended.
	 *
	 * @param url     base address the listing page name is appended to
	 * @param writer  destination of the CSV rows; left open
	 * @param pageNum exclusive upper bound of the page index
	 */
	void CsvForBreed(String url, FileWriter writer, int pageNum) {
		try {
			for (int p = 1; p < pageNum; p++) {
				String pageUrl = url + "list_9_" + p + ".html";
				Document doc = Jsoup.parse(InputStream2String(
						new InputStreamReader(getInfo(pageUrl), PAGE_CHARSET)));
				Elements petList = doc.getElementsByClass("tiyan-smll-det");

				for (Element pet : petList) {
					// The first child of each entry carries the detail link and the title.
					Element link = pet.child(0);
					String detailUrl = link.attr("href");
					out.println(detailUrl);
					List<String> info = parserText(
							new InputStreamReader(getInfo(detailUrl), PAGE_CHARSET));
					String pic = link.attr("title")
							.replace("/", "-")
							.replace("|", "-")
							.replace("\\", "-");
					info.add("cloud://pethub-database.7065-pethub-database-1301811252/cat/"+pic+".jpg");
					info.add("\n");
					writer.append(String.join(",", info));
				}
				out.println("保存好了第"+p+"页");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Opens an HTTP connection to {@code url} with browser-like request headers and returns
	 * the response body stream. The caller is responsible for closing the stream.
	 *
	 * @param url address to fetch
	 * @return the response body stream
	 * @throws MalformedURLException if {@code url} is not a valid URL
	 * @throws IOException           if the connection or the request fails
	 */
	public InputStream getInfo(String url) throws MalformedURLException, IOException {
		URL link = new URL(url);
		// No good IP proxy was found, so the connection is made directly for now.
		HttpURLConnection conn = (HttpURLConnection) link.openConnection();
		conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Mobile Safari/537.36");
		conn.setRequestProperty("Connection", "keep-alive");
		conn.setRequestProperty("Accept", "image/webp,image/apng,image/*,*/*;q=0.8");
		conn.setRequestProperty("Host","www.ixiupet.com");
		conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
		conn.setReadTimeout(READ_TIMEOUT_MS);
		try {
			conn.connect();
			Thread.sleep(REQUEST_DELAY_MS);
		} catch (InterruptedException e) {
			// Restore the interrupt flag so callers can still observe the interruption.
			Thread.currentThread().interrupt();
		} catch (IOException ignored) {
			// Deliberately ignored: getInputStream() below connects implicitly if needed
			// and reports the failure to the caller.
		}

		return conn.getInputStream();
	}
}
//GetProxy agent= new GetProxy();
	/*
	class GetProxy {
		List <String> proxyList;
		Random random = new Random();
		GetProxy(){
			this.proxyList= new LinkedList <String>(Arrays.asList(
					"121.69.26.14:8080",
					"132.145.89.166:3128"));
		}
	
		void removeProxy(String p){
			this.proxyList.remove(this.proxyList.indexOf(p));
		}
		
		String getProxy() {
			//return proxyList.get(random.nextInt(proxyList.size()));
			
			String [] ip = proxyList.get(random.nextInt(proxyList.size())).split(":");
			System.out.println(ip[0]+ip[1]);
			return new Proxy(Proxy.Type.HTTP,
				new InetSocketAddress(ip[0],Integer.valueOf(ip[1])));
		}
	}*/	
